# Necessary Imports
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
from collections import defaultdict
import sklearn
from sklearn.datasets import fetch_olivetti_faces
from sklearn.metrics import accuracy_score, silhouette_score, classification_report
from sklearn.svm import SVC
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split, cross_val_score
import warnings
# Remove annoying alerts
warnings.filterwarnings('ignore')
# Fetch Olivetti dataset from Sklearn: 40 subjects x 10 grayscale 64x64 portraits.
# shuffle=True interleaves subjects; random_state pins the shuffle for reproducibility.
dataset = fetch_olivetti_faces(shuffle=True, random_state=98)
type(dataset)
sklearn.utils._bunch.Bunch
# Storing features, target variable, and 2d features matrix as images
X = dataset.data          # flattened pixels, shape (400, 4096)
y = dataset.target        # subject id per image, 0..39
images = dataset.images   # same pixels kept as (400, 64, 64) for plotting
type(images)
numpy.ndarray
images.shape
(400, 64, 64)
# Bundle X and y into a dataframe: one column per pixel plus the target label.
total_pixels = X.shape[1]
pixel_columns = [f"pixel_{idx}" for idx in range(1, total_pixels + 1)]
df = pd.DataFrame(data=X, columns=pixel_columns)
df['target'] = y
df.head()
| pixel_1 | pixel_2 | pixel_3 | pixel_4 | pixel_5 | pixel_6 | pixel_7 | pixel_8 | pixel_9 | pixel_10 | ... | pixel_4088 | pixel_4089 | pixel_4090 | pixel_4091 | pixel_4092 | pixel_4093 | pixel_4094 | pixel_4095 | pixel_4096 | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.086777 | 0.099174 | 0.115702 | 0.128099 | 0.214876 | 0.359504 | 0.512397 | 0.603306 | 0.652893 | 0.702479 | ... | 0.487603 | 0.479339 | 0.466942 | 0.450413 | 0.454545 | 0.454545 | 0.210744 | 0.318182 | 0.491736 | 37 |
| 1 | 0.260331 | 0.351240 | 0.438017 | 0.553719 | 0.648760 | 0.694215 | 0.747934 | 0.789256 | 0.809917 | 0.830579 | ... | 0.541322 | 0.516529 | 0.520661 | 0.326446 | 0.074380 | 0.223140 | 0.256198 | 0.309917 | 0.289256 | 7 |
| 2 | 0.103306 | 0.219008 | 0.177686 | 0.219008 | 0.392562 | 0.574380 | 0.669421 | 0.681818 | 0.710744 | 0.731405 | ... | 0.367769 | 0.433884 | 0.421488 | 0.425620 | 0.429752 | 0.429752 | 0.438017 | 0.475207 | 0.276859 | 3 |
| 3 | 0.669421 | 0.636364 | 0.648760 | 0.685950 | 0.710744 | 0.760331 | 0.768595 | 0.805785 | 0.793388 | 0.809917 | ... | 0.223140 | 0.219008 | 0.115702 | 0.090909 | 0.090909 | 0.095041 | 0.086777 | 0.082645 | 0.074380 | 13 |
| 4 | 0.425620 | 0.475207 | 0.458678 | 0.500000 | 0.524793 | 0.524793 | 0.541322 | 0.557851 | 0.586777 | 0.553719 | ... | 0.491736 | 0.690083 | 0.661157 | 0.669421 | 0.524793 | 0.433884 | 0.491736 | 0.483471 | 0.483471 | 33 |
5 rows × 4097 columns
# Inspect structure: 400 rows, 4096 float32 pixel columns + one int32 target (~6.3 MB).
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 400 entries, 0 to 399 Columns: 4097 entries, pixel_1 to target dtypes: float32(4096), int32(1) memory usage: 6.3 MB
# Define a function to plot (default = 40) sample images
def plot_gallery(images, titles, h, w, n_row=5, n_col=8):
    """Plot a gallery of portraits in an n_row x n_col grid.

    Parameters
    ----------
    images : sequence of arrays, each reshapeable to (h, w)
    titles : sequence of per-image titles, aligned with `images`
    h, w : int, image height and width in pixels
    n_row, n_col : int, grid shape (defaults give 40 panels)
    """
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    # Clamp to what is actually available: the original indexed blindly up to
    # n_row * n_col and raised IndexError for shorter inputs.
    n_panels = min(n_row * n_col, len(images), len(titles))
    for i in range(n_panels):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        # Hide tick marks — only the pictures matter here.
        plt.xticks(())
        plt.yticks(())
# Render the first 40 faces, titled with their subject ids.
plot_gallery(images, y, h=64, w=64)
plt.show()
# Split dataset into train, validation, and test sets with stratification
# 70/15/15: carve off 30%, then halve that into validation and test.
# stratify keeps all 40 subjects equally represented in every subset.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=98, stratify=y)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=98, stratify=y_temp)
print(f"Training set size: {len(y_train)}")
print(f"Validation set size: {len(y_valid)}")
print(f"Test set size: {len(y_test)}")
Training set size: 280 Validation set size: 60 Test set size: 60
# Instantiate a Support Vector Classifier with an RBF kernel
svm_clf = SVC(kernel='rbf', random_state=98)
# Get 5-fold cross validation scores (cross_val_score clones the estimator per fold,
# so the shared svm_clf instance is not mutated here)
k = 5
scores = cross_val_score(svm_clf, X_train, y_train, cv=k, scoring='accuracy')
print(f"Cross-validation scores (k={k}):", scores)
print("Average cross-validation score:", scores.mean())
Cross-validation scores (k=5): [0.85714286 0.92857143 0.92857143 0.83928571 0.85714286] Average cross-validation score: 0.8821428571428571
# Train the SVC classifier on the full 4096-pixel feature vectors
svm_clf.fit(X_train, y_train)
SVC(random_state=98)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(random_state=98)
# Make predictions and print validation scores on the validation set
y_pred_valid = svm_clf.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred_valid)
# NOTE(review): the f-prefix below is unnecessary — the string has no placeholders.
print(f"Validation accuracy with kernel= rbf:", accuracy)
Validation accuracy with kernel= rbf: 0.9333333333333333
# Make predictions on the test set
y_pred = svm_clf.predict(X_test)
# Print the classification report — note per-class support is only 1-2 samples,
# so individual class precision/recall figures are very noisy.
print('\t\tClassification Report - SVC\n\n', classification_report(y_test, y_pred))
Classification Report - SVC
precision recall f1-score support
0 1.00 1.00 1.00 1
1 1.00 1.00 1.00 2
2 0.00 0.00 0.00 2
3 1.00 0.50 0.67 2
4 1.00 0.50 0.67 2
5 1.00 1.00 1.00 1
6 1.00 1.00 1.00 2
7 0.67 1.00 0.80 2
8 0.50 1.00 0.67 1
9 1.00 0.50 0.67 2
10 1.00 1.00 1.00 1
11 1.00 1.00 1.00 1
12 1.00 1.00 1.00 1
13 1.00 1.00 1.00 1
14 1.00 1.00 1.00 1
15 1.00 1.00 1.00 2
16 1.00 1.00 1.00 1
17 1.00 1.00 1.00 1
18 1.00 1.00 1.00 1
19 1.00 1.00 1.00 1
20 1.00 1.00 1.00 1
21 1.00 1.00 1.00 2
22 0.25 1.00 0.40 1
23 1.00 1.00 1.00 2
24 1.00 1.00 1.00 1
25 1.00 1.00 1.00 2
26 1.00 1.00 1.00 2
27 1.00 1.00 1.00 2
28 1.00 1.00 1.00 1
29 1.00 1.00 1.00 2
30 1.00 1.00 1.00 2
31 1.00 1.00 1.00 2
32 1.00 1.00 1.00 1
33 1.00 1.00 1.00 2
34 1.00 1.00 1.00 2
35 1.00 1.00 1.00 2
36 1.00 1.00 1.00 1
37 0.00 0.00 0.00 1
38 1.00 1.00 1.00 1
39 0.33 0.50 0.40 2
accuracy 0.88 60
macro avg 0.89 0.90 0.88 60
weighted avg 0.90 0.88 0.87 60
# Explore the target variables further — confirms a perfectly balanced dataset:
# 10 images for each of the 40 subjects.
np.unique(y, return_counts=True)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38, 39]),
array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10], dtype=int64))
# Sweep k over 2..199: fit KMeans on the training set for each candidate cluster
# count and record the silhouette score of the resulting assignment.
scores = []
range_clusters = range(2, 200)
for n_clusters in range_clusters:
    kmeans = KMeans(n_clusters=n_clusters, init="k-means++", n_init='auto', random_state=98)
    labels = kmeans.fit(X_train).predict(X_train)
    scores.append(silhouette_score(X_train, labels))
# Dump every silhouette score from the sweep above, one line per index.
for idx, sil in enumerate(scores):
    print(f'Index {idx} : {sil}')
Index 0 : 0.15040580928325653 Index 1 : 0.12763933837413788 Index 2 : 0.10531938076019287 Index 3 : 0.10427547246217728 Index 4 : 0.10007187724113464 Index 5 : 0.08305481821298599 Index 6 : 0.08263854682445526 Index 7 : 0.07518859207630157 Index 8 : 0.073173888027668 Index 9 : 0.08468463271856308 Index 10 : 0.09262099862098694 Index 11 : 0.09107372164726257 Index 12 : 0.09361782670021057 Index 13 : 0.09615043550729752 Index 14 : 0.08599311113357544 Index 15 : 0.08935511112213135 Index 16 : 0.09138374030590057 Index 17 : 0.10017868131399155 Index 18 : 0.10079923272132874 Index 19 : 0.10017043352127075 Index 20 : 0.10201352834701538 Index 21 : 0.10470715910196304 Index 22 : 0.10446251928806305 Index 23 : 0.10010135918855667 Index 24 : 0.10107807070016861 Index 25 : 0.10394089668989182 Index 26 : 0.10997769236564636 Index 27 : 0.11223439127206802 Index 28 : 0.11691679060459137 Index 29 : 0.11985310167074203 Index 30 : 0.12723535299301147 Index 31 : 0.13064169883728027 Index 32 : 0.13365775346755981 Index 33 : 0.13824263215065002 Index 34 : 0.14249515533447266 Index 35 : 0.1481761634349823 Index 36 : 0.14357256889343262 Index 37 : 0.1436309516429901 Index 38 : 0.1397218406200409 Index 39 : 0.1467689722776413 Index 40 : 0.14565765857696533 Index 41 : 0.1496649533510208 Index 42 : 0.1529509425163269 Index 43 : 0.1521298885345459 Index 44 : 0.15335075557231903 Index 45 : 0.14976109564304352 Index 46 : 0.15099528431892395 Index 47 : 0.15427374839782715 Index 48 : 0.15273474156856537 Index 49 : 0.15652386844158173 Index 50 : 0.15424807369709015 Index 51 : 0.15666350722312927 Index 52 : 0.15177112817764282 Index 53 : 0.16995060443878174 Index 54 : 0.1676616668701172 Index 55 : 0.1686926633119583 Index 56 : 0.16975824534893036 Index 57 : 0.1662697046995163 Index 58 : 0.1699931025505066 Index 59 : 0.1718224734067917 Index 60 : 0.17366155982017517 Index 61 : 0.17582568526268005 Index 62 : 0.1759594827890396 Index 63 : 0.17405714094638824 Index 64 : 0.17558467388153076 Index 65 
: 0.17994172871112823 Index 66 : 0.18225790560245514 Index 67 : 0.18075674772262573 Index 68 : 0.17639149725437164 Index 69 : 0.17577309906482697 Index 70 : 0.1748947650194168 Index 71 : 0.1761694699525833 Index 72 : 0.17674699425697327 Index 73 : 0.17763976752758026 Index 74 : 0.17831090092658997 Index 75 : 0.17750287055969238 Index 76 : 0.18052951991558075 Index 77 : 0.18284288048744202 Index 78 : 0.18246963620185852 Index 79 : 0.18181942403316498 Index 80 : 0.18212315440177917 Index 81 : 0.1808498352766037 Index 82 : 0.17763452231884003 Index 83 : 0.1776471585035324 Index 84 : 0.17787683010101318 Index 85 : 0.17529632151126862 Index 86 : 0.17517326772212982 Index 87 : 0.17716245353221893 Index 88 : 0.17552055418491364 Index 89 : 0.17928534746170044 Index 90 : 0.17951925098896027 Index 91 : 0.1787957400083542 Index 92 : 0.17868363857269287 Index 93 : 0.1791733354330063 Index 94 : 0.17808859050273895 Index 95 : 0.17738115787506104 Index 96 : 0.177434504032135 Index 97 : 0.17786002159118652 Index 98 : 0.18246525526046753 Index 99 : 0.1808396726846695 Index 100 : 0.17735245823860168 Index 101 : 0.1762050837278366 Index 102 : 0.17509391903877258 Index 103 : 0.17494116723537445 Index 104 : 0.17638462781906128 Index 105 : 0.17514196038246155 Index 106 : 0.1750955581665039 Index 107 : 0.16826532781124115 Index 108 : 0.16866986453533173 Index 109 : 0.16804738342761993 Index 110 : 0.1680053174495697 Index 111 : 0.1669185906648636 Index 112 : 0.16684848070144653 Index 113 : 0.16438926756381989 Index 114 : 0.16455718874931335 Index 115 : 0.1647680401802063 Index 116 : 0.16489136219024658 Index 117 : 0.16405563056468964 Index 118 : 0.16235491633415222 Index 119 : 0.1630207598209381 Index 120 : 0.1625487059354782 Index 121 : 0.16416308283805847 Index 122 : 0.16108065843582153 Index 123 : 0.16164664924144745 Index 124 : 0.16370269656181335 Index 125 : 0.16323113441467285 Index 126 : 0.162475124001503 Index 127 : 0.15978407859802246 Index 128 : 0.1603734791278839 Index 129 : 
0.1608683466911316 Index 130 : 0.15711955726146698 Index 131 : 0.15514495968818665 Index 132 : 0.15438158810138702 Index 133 : 0.15295284986495972 Index 134 : 0.15218976140022278 Index 135 : 0.151565819978714 Index 136 : 0.1495165079832077 Index 137 : 0.15253356099128723 Index 138 : 0.15231600403785706 Index 139 : 0.15198323130607605 Index 140 : 0.15227250754833221 Index 141 : 0.1531990021467209 Index 142 : 0.15380799770355225 Index 143 : 0.15116570889949799 Index 144 : 0.15025781095027924 Index 145 : 0.15048879384994507 Index 146 : 0.14884935319423676 Index 147 : 0.17412197589874268 Index 148 : 0.16910052299499512 Index 149 : 0.1681765466928482 Index 150 : 0.16596214473247528 Index 151 : 0.1649814397096634 Index 152 : 0.1641160547733307 Index 153 : 0.16402922570705414 Index 154 : 0.1618376523256302 Index 155 : 0.15940622985363007 Index 156 : 0.1595630794763565 Index 157 : 0.15751375257968903 Index 158 : 0.1581299901008606 Index 159 : 0.15863680839538574 Index 160 : 0.15741732716560364 Index 161 : 0.15688654780387878 Index 162 : 0.15663589537143707 Index 163 : 0.1536678522825241 Index 164 : 0.1506621241569519 Index 165 : 0.15042011439800262 Index 166 : 0.14912308752536774 Index 167 : 0.14985905587673187 Index 168 : 0.14700503647327423 Index 169 : 0.1476399004459381 Index 170 : 0.14566178619861603 Index 171 : 0.14340174198150635 Index 172 : 0.14179828763008118 Index 173 : 0.14100085198879242 Index 174 : 0.14016954600811005 Index 175 : 0.1385919600725174 Index 176 : 0.1381639540195465 Index 177 : 0.13688717782497406 Index 178 : 0.13773921132087708 Index 179 : 0.13580232858657837 Index 180 : 0.13455404341220856 Index 181 : 0.13365940749645233 Index 182 : 0.13127896189689636 Index 183 : 0.1298736333847046 Index 184 : 0.12892693281173706 Index 185 : 0.12756234407424927 Index 186 : 0.12827464938163757 Index 187 : 0.12666037678718567 Index 188 : 0.1267896443605423 Index 189 : 0.12822739779949188 Index 190 : 0.12558528780937195 Index 191 : 0.12391166388988495 Index 192 : 
0.12362907826900482 Index 193 : 0.12385866791009903 Index 194 : 0.12379568815231323 Index 195 : 0.12335821241140366 Index 196 : 0.12285757064819336 Index 197 : 0.1215449869632721
# Get the highest silhouette score and the cluster count that produced it.
# range_clusters starts at 2, so best index 77 maps to k = 79.
best_n_clusters = range_clusters[scores.index(max(scores))]
best_n_clusters
79
# The original indexed back into the list with scores.index(max(scores)) — a
# redundant double lookup that is exactly max(scores).
best_n_scores = max(scores)
best_n_scores
0.18284288
scores[77]
0.18284288
# Plot silhouette score vs. k; the axis window trims to the informative region.
plt.figure(figsize=(24, 8))
plt.plot(range_clusters, scores, "bo-")
plt.xlabel("$k$", fontsize=14)
plt.ylabel("Silhouette score", fontsize=14)
plt.axis([1.5, 120, 0.05, 0.5])
plt.show()
# Reduce the dataset dimensionality according to the number of clusters that returned the highest silhouette score
# fit_transform maps each face from 4096 pixels to its 79 centroid distances.
# NOTE(review): KMeans is fit on ALL of X (train+valid+test) before the split
# below — mild data leakage; consider fitting on X_train only and transforming
# the other subsets.
kmeans = KMeans(n_clusters=79, init="k-means++", n_init='auto', random_state=98)
X_reduced = kmeans.fit_transform(X)
X_reduced.shape
(400, 79)
# Array with the first instance's distances to the centroids
X_reduced[0]
array([11.988758 , 11.11365 , 15.363498 , 9.444666 , 11.000386 ,
5.4597178, 9.867196 , 8.780561 , 12.209009 , 9.978074 ,
14.158886 , 14.16626 , 11.892452 , 8.184852 , 7.400579 ,
11.166139 , 11.0718565, 12.919109 , 13.880829 , 11.196977 ,
8.596896 , 11.290294 , 9.71953 , 9.962675 , 14.4639015,
13.494672 , 13.580415 , 9.469277 , 11.30324 , 11.732554 ,
11.46953 , 9.1103525, 10.009584 , 10.687915 , 10.7051325,
10.389788 , 9.519195 , 10.605962 , 10.743815 , 9.234018 ,
11.074283 , 13.077244 , 13.626669 , 8.276454 , 14.109044 ,
12.779579 , 13.597022 , 9.647831 , 8.340371 , 6.6628537,
11.615242 , 13.948507 , 9.404628 , 8.8362 , 13.465821 ,
9.559306 , 11.116907 , 10.479551 , 11.788693 , 9.9946785,
11.711618 , 9.873837 , 9.338231 , 11.875794 , 13.702015 ,
9.150525 , 8.512709 , 16.701874 , 11.092509 , 12.574738 ,
8.481403 , 11.284654 , 10.599734 , 9.639815 , 9.883467 ,
12.434465 , 11.363939 , 8.173353 , 9.145894 ], dtype=float32)
# Split the reduced Olivetti dataset — identical random_state and stratify as the
# raw-pixel split, so the same rows land in the same subsets.
X_train_reduced, X_temp_reduced, y_train, y_temp = train_test_split(X_reduced, y, test_size=0.3, random_state=98, stratify=y)
X_valid_reduced, X_test_reduced, y_valid, y_test = train_test_split(X_temp_reduced, y_temp, test_size=0.5, random_state=98, stratify=y_temp)
print(f"Training set size: {len(y_train)}")
print(f"Validation set size: {len(y_valid)}")
print(f"Test set size: {len(y_test)}")
Training set size: 280 Validation set size: 60 Test set size: 60
# Retrain the classifier with the reduced (79-feature) dataset — refitting
# discards the earlier raw-pixel fit.
svm_clf.fit(X_train_reduced, y_train)
SVC(random_state=98)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVC(random_state=98)
# 5-fold cross validation on the centroid-distance features; accuracy drops
# sharply vs. raw pixels (0.88 -> 0.59), so this reduction loses identity signal.
k = 5
scores = cross_val_score(svm_clf, X_train_reduced, y_train, cv=k, scoring='accuracy')
print(f"Cross-validation scores (k={k}):", scores)
print("Average cross-validation score:", scores.mean())
Cross-validation scores (k=5): [0.55357143 0.57142857 0.67857143 0.46428571 0.67857143] Average cross-validation score: 0.5892857142857142
# Validation accuracy on the reduced features.
y_pred_valid_reduced = svm_clf.predict(X_valid_reduced)
accuracy = accuracy_score(y_valid, y_pred_valid_reduced)
print(f"Validation accuracy with kernel= rbf:", accuracy)
Validation accuracy with kernel= rbf: 0.65
# Test-set predictions on the reduced features.
y_pred_reduced = svm_clf.predict(X_test_reduced)
# Print the classification report
print('\t\tClassification Report - SVC\n\n', classification_report(y_test, y_pred_reduced))
Classification Report - SVC
precision recall f1-score support
0 0.50 1.00 0.67 1
1 1.00 1.00 1.00 2
2 0.00 0.00 0.00 2
3 0.00 0.00 0.00 2
4 1.00 0.50 0.67 2
5 0.00 0.00 0.00 1
6 0.67 1.00 0.80 2
7 1.00 1.00 1.00 2
8 0.33 1.00 0.50 1
9 0.00 0.00 0.00 2
10 0.50 1.00 0.67 1
11 0.00 0.00 0.00 1
12 0.00 0.00 0.00 1
13 0.50 1.00 0.67 1
14 0.50 1.00 0.67 1
15 0.00 0.00 0.00 2
16 0.00 0.00 0.00 1
17 0.33 1.00 0.50 1
18 1.00 1.00 1.00 1
19 1.00 1.00 1.00 1
20 0.00 0.00 0.00 1
21 1.00 1.00 1.00 2
22 0.00 0.00 0.00 1
23 1.00 1.00 1.00 2
24 0.33 1.00 0.50 1
25 1.00 1.00 1.00 2
26 1.00 1.00 1.00 2
27 1.00 1.00 1.00 2
28 0.00 0.00 0.00 1
29 0.67 1.00 0.80 2
30 1.00 1.00 1.00 2
31 1.00 0.50 0.67 2
32 1.00 1.00 1.00 1
33 1.00 1.00 1.00 2
34 0.67 1.00 0.80 2
35 1.00 0.50 0.67 2
36 1.00 1.00 1.00 1
37 0.00 0.00 0.00 1
38 0.50 1.00 0.67 1
39 0.00 0.00 0.00 2
accuracy 0.65 60
macro avg 0.54 0.64 0.56 60
weighted avg 0.59 0.65 0.59 60
# Sanity-check the raw feature matrix before density-based clustering.
X.shape
(400, 4096)
# Pixel intensities are float32; values shown appear to lie in [0, 1] — TODO confirm.
X[:10]
array([[0.08677686, 0.09917355, 0.11570248, ..., 0.2107438 , 0.3181818 ,
0.49173555],
[0.2603306 , 0.35123968, 0.43801653, ..., 0.25619835, 0.30991736,
0.2892562 ],
[0.10330579, 0.21900827, 0.17768595, ..., 0.43801653, 0.4752066 ,
0.2768595 ],
...,
[0.48347107, 0.446281 , 0.46280992, ..., 0.661157 , 0.6735537 ,
0.6694215 ],
[0.35123968, 0.4338843 , 0.553719 , ..., 0.1570248 , 0.2107438 ,
0.20661157],
[0.7107438 , 0.7107438 , 0.70247936, ..., 0.553719 , 0.30991736,
0.30991736]], dtype=float32)
# Testing the distance and min_samples manually
# eps hand-tuned; min_samples=2 lets any pair of nearby faces seed a cluster.
dbscan = DBSCAN(eps = 7.24, min_samples=2)
clusters = dbscan.fit_predict(X)
# Label -1 is DBSCAN's noise label; the rest are cluster ids 0..45.
print(np.unique(clusters))
[-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45]
print(len(np.unique(clusters)))
47
# Outliers: number of faces DBSCAN left unclustered (noise points).
print(len(clusters[clusters == -1]))
34
# Restore the 2-D image shape, then bucket every face by its DBSCAN label.
images = X.reshape(-1, 64, 64)
images.shape
(400, 64, 64)
clustered_images = defaultdict(list)
for idx, label in enumerate(clusters):
    clustered_images[label].append(images[idx])
def display_images(images, title=""):
    """Show `images` in a near-square grid under a shared `title`.

    rows = floor(sqrt(n)); cols is sized so rows * cols >= n.
    """
    n_images = len(images)
    # Guard: the original computed rows == 0 for an empty list and then
    # divided by it (ZeroDivisionError). An empty cluster now shows nothing.
    if n_images == 0:
        return
    rows = int(n_images**0.5)
    cols = (n_images // rows) + (n_images % rows)
    plt.figure(figsize=(1.5*cols, 1.5*rows))
    for i in range(n_images):
        ax = plt.subplot(rows, cols, i + 1)
        plt.imshow(images[i], cmap="gray")
        ax.axis('off')
    plt.suptitle(title)
    plt.show()
# Display each DBSCAN cluster's faces. The loop variable is renamed from
# `images`, which shadowed — and on each iteration clobbered — the
# module-level `images` array built above.
for cluster, cluster_faces in clustered_images.items():
    display_images(cluster_faces, title=f"Cluster {cluster}")